In [ ]:
#Image Captioning
In [ ]:
import os   # handling the files
import pickle # storing numpy features
import numpy as np
from tqdm.notebook import tqdm # how much data is process till now
from tensorflow.keras.applications.vgg16 import VGG16 , preprocess_input # extract features from image data.
from tensorflow.keras.preprocessing.image import load_img , img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input , Dense , LSTM , Embedding , Dropout , add
In [ ]:
BASE_DIR = 'E:\Masters\Image_captioning'
WORKING_DIR = 'E:\Masters\Image_captioning'
In [ ]:
# Load vgg16 Model
model = VGG16()

# restructure model
model = Model(inputs = model.inputs , outputs = model.layers[-2].output)

# Summerize
print(model.summary())
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0         
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0         
                                                                 
 block3_conv1 (Conv2D)       (None, 56, 56, 256)       295168    
                                                                 
 block3_conv2 (Conv2D)       (None, 56, 56, 256)       590080    
                                                                 
 block3_conv3 (Conv2D)       (None, 56, 56, 256)       590080    
                                                                 
 block3_pool (MaxPooling2D)  (None, 28, 28, 256)       0         
                                                                 
 block4_conv1 (Conv2D)       (None, 28, 28, 512)       1180160   
                                                                 
 block4_conv2 (Conv2D)       (None, 28, 28, 512)       2359808   
                                                                 
 block4_conv3 (Conv2D)       (None, 28, 28, 512)       2359808   
                                                                 
 block4_pool (MaxPooling2D)  (None, 14, 14, 512)       0         
                                                                 
 block5_conv1 (Conv2D)       (None, 14, 14, 512)       2359808   
                                                                 
 block5_conv2 (Conv2D)       (None, 14, 14, 512)       2359808   
                                                                 
 block5_conv3 (Conv2D)       (None, 14, 14, 512)       2359808   
                                                                 
 block5_pool (MaxPooling2D)  (None, 7, 7, 512)         0         
                                                                 
 flatten (Flatten)           (None, 25088)             0         
                                                                 
 fc1 (Dense)                 (None, 4096)              102764544 
                                                                 
 fc2 (Dense)                 (None, 4096)              16781312  
                                                                 
=================================================================
Total params: 134,260,544
Trainable params: 134,260,544
Non-trainable params: 0
_________________________________________________________________
None
In [ ]:
# extract features from image
features = {}
directory = os.path.join(BASE_DIR, 'Images')

for img_name in tqdm(os.listdir(directory)):
    # load the image from file
    img_path = directory + '/' + img_name
    image = load_img(img_path, target_size=(224, 224))
    # convert image pixels to numpy array
    image = img_to_array(image)
    # reshape data for model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # preprocess image for vgg
    image = preprocess_input(image)
    # extract features
    feature = model.predict(image, verbose=0)
    # get image ID
    image_id = img_name.split('.')[0]
    # store feature
    features[image_id] = feature
  0%|          | 0/8092 [00:00<?, ?it/s]
In [ ]:
# store features in pickle
pickle.dump(features, open(os.path.join(WORKING_DIR, 'features.pkl'), 'wb'))
In [ ]:
# load features from pickle
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'rb') as f:
    features = pickle.load(f)
In [ ]:
with open(os.path.join(BASE_DIR, 'captions.txt'), 'r') as f:
    next(f)
    captions_doc = f.read()
In [ ]:
# create mapping of image to captions
mapping = {}
# process lines
for line in tqdm(captions_doc.split('\n')):
    # split the line by comma(,)
    tokens = line.split(',')
    if len(line) < 2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    # remove extension from image ID
    image_id = image_id.split('.')[0]
    # convert caption list to string
    caption = " ".join(caption)
    # create list if needed
    if image_id not in mapping:
        mapping[image_id] = []
    # store the caption
    mapping[image_id].append(caption)
  0%|          | 0/40456 [00:00<?, ?it/s]
In [ ]:
len(mapping)
Out[ ]:
8091

Preprocess Text Data

In [ ]:
def clean(mapping):
    for key, captions in mapping.items():
        for i in range(len(captions)):
            # take one caption at a time
            caption = captions[i]
            # preprocessing steps
            # convert to lowercase
            caption = caption.lower()
            # delete digits, special chars, etc., 
            caption = caption.replace('[^A-Za-z]', '')
            # delete additional spaces
            caption = caption.replace('\s+', ' ')
            # add start and end tags to the caption
            caption = 'startseq ' + " ".join([word for word in caption.split() if len(word)>1]) + ' endseq'
            captions[i] = caption    
In [ ]:
mapping['1000268201_693b08cb0e']
Out[ ]:
['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

It has some bleeding puntucation so we are cleaning the data by using the clean funtion

In [ ]:
clean(mapping)
In [ ]:
mapping['1000268201_693b08cb0e']
Out[ ]:
['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the stairs to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq']

Here we can see the output without the punctuation

Now We are storing the captions in the list from dict

In [ ]:
all_captions = []
for key in mapping:
    for caption in mapping[key]:
        all_captions.append(caption)
In [ ]:
len(all_captions)
Out[ ]:
40455
In [ ]:
#printing the 10 captions
all_captions[:10]
Out[ ]:
['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the stairs to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq',
 'startseq black dog and spotted dog are fighting endseq',
 'startseq black dog and tri-colored dog playing with each other on the road endseq',
 'startseq black dog and white dog with brown spots are staring at each other in the street endseq',
 'startseq two dogs of different breeds looking at each other on the road endseq',
 'startseq two dogs on pavement moving toward each other endseq']
In [ ]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) +1
print(vocab_size)
8485
In [ ]:
# get maximum length of the caption available
max_length = max(len(caption.split()) for caption in all_captions)
max_length
Out[ ]:
35

Train and test split

In [ ]:
image_ids = list(mapping.keys())
split = int(len(image_ids) * 0.80)
train = image_ids[:split]
test = image_ids[split:]
In [ ]:
# create data generator to get data in batch (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    # loop over images
    X1, X2, y = list(), list(), list()
    n = 0
    while 1:
        for key in data_keys:
            n += 1
            captions = mapping[key]
            # process each caption
            for caption in captions:
                # encode the sequence
                seq = tokenizer.texts_to_sequences([caption])[0]
                # split the sequence into X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pairs
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # encode output sequence
                    out_seq = to_categorical([out_seq],num_classes=vocab_size)[0]
                    # store the sequences
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                X1, X2, y = np.array(X1), np.array(X2), np.array(y)
                yield [X1, X2], y
                X1, X2, y = list(), list(), list()
                n = 0
In [ ]:
epochs = 20
batch_size = 32
steps = len(train) // batch_size
In [ ]:
# encoder model
# image feature layers
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# sequence feature layers
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# decoder model
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# plot the model
plot_model(model, show_shapes=True)
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.
In [ ]:
# train the model
epochs = 20 
batch_size = 32
steps = len(train) // batch_size

for i in range(epochs):
    # create data generator
    generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)
    # fit for one epoch
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
202/202 [==============================] - 27s 119ms/step - loss: 5.3098
202/202 [==============================] - 26s 131ms/step - loss: 4.0657
202/202 [==============================] - 29s 142ms/step - loss: 3.6080
202/202 [==============================] - 32s 160ms/step - loss: 3.3343
202/202 [==============================] - 53s 265ms/step - loss: 3.1329
202/202 [==============================] - 56s 276ms/step - loss: 2.9824
202/202 [==============================] - 58s 289ms/step - loss: 2.8672
202/202 [==============================] - 59s 290ms/step - loss: 2.7696
202/202 [==============================] - 49s 242ms/step - loss: 2.6855
202/202 [==============================] - 63s 313ms/step - loss: 2.6112
202/202 [==============================] - 52s 257ms/step - loss: 2.5450
202/202 [==============================] - 60s 298ms/step - loss: 2.4938
202/202 [==============================] - 61s 302ms/step - loss: 2.4457
202/202 [==============================] - 39s 192ms/step - loss: 2.3967
202/202 [==============================] - 55s 270ms/step - loss: 2.3514
202/202 [==============================] - 60s 294ms/step - loss: 2.3114
202/202 [==============================] - 34s 170ms/step - loss: 2.2728
202/202 [==============================] - 44s 217ms/step - loss: 2.2399
202/202 [==============================] - 49s 244ms/step - loss: 2.2051
202/202 [==============================] - 65s 324ms/step - loss: 2.1774
In [ ]:
# save the model
model.save(WORKING_DIR+'/best_model.h5')
In [ ]:
def idx_to_word(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None 
In [ ]:
# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    # add start tag for generation process
    in_text = 'startseq'
    # iterate over the max length of sequence
    for i in range(max_length):
        # encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad the sequence
        sequence = pad_sequences([sequence], max_length)
        # predict next word
        yhat = model.predict([image, sequence], verbose=0)
        # get index with high probability
        yhat = np.argmax(yhat)
        # convert index to word
        word = idx_to_word(yhat, tokenizer)
        # stop if word not found
        if word is None:
            break
        # append word as input for generating next word
        in_text += " " + word
        # stop if we reach end tag
        if word == 'endseq':
            break
    return in_text
In [ ]:
from nltk.translate.bleu_score import corpus_bleu
# validate with test data
actual, predicted = list(), list()

for key in tqdm(test):
    # get actual caption
    captions = mapping[key]
    # predict the caption for image
    y_pred = predict_caption(model, features[key], tokenizer, max_length)
    # split into words
    actual_captions = [caption.split() for caption in captions]
    y_pred = y_pred.split()
    # append to the list
    actual.append(actual_captions) 
    predicted.append(y_pred)
# calcuate BLEU score
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
  0%|          | 0/1619 [00:00<?, ?it/s]
BLEU-1: 0.537607
BLEU-2: 0.309126
BLEU-2: 0.309126
In [ ]:
from PIL import Image
import matplotlib.pyplot as plt
def generate_caption(image_name):
    # load the image
    # image_name = "1001773457_577c3a7d70.jpg"
    image_id = image_name.split('.')[0]
    img_path = os.path.join(BASE_DIR, "Images", image_name)
    image = Image.open(img_path)
    captions = mapping[image_id]
    print('---------------------Actual---------------------')
    for caption in captions:
        print(caption)
    # predict the caption
    y_pred = predict_caption(model, features[image_id], tokenizer, max_length)
    print('--------------------Predicted--------------------') 
    print(y_pred)
    plt.imshow(image)
In [ ]:
generate_caption("1001773457_577c3a7d70.jpg")
---------------------Actual---------------------
startseq black dog and spotted dog are fighting endseq
startseq black dog and tri-colored dog playing with each other on the road endseq
startseq black dog and white dog with brown spots are staring at each other in the street endseq
startseq two dogs of different breeds looking at each other on the road endseq
startseq two dogs on pavement moving toward each other endseq
--------------------Predicted--------------------
startseq two dogs are playing with toy in the grass endseq
In [ ]:
generate_caption("1002674143_1b742ab4b8.jpg")
---------------------Actual---------------------
startseq little girl covered in paint sits in front of painted rainbow with her hands in bowl endseq
startseq little girl is sitting in front of large painted rainbow endseq
startseq small girl in the grass plays with fingerpaints in front of white canvas with rainbow on it endseq
startseq there is girl with pigtails sitting in front of rainbow painting endseq
startseq young girl with pigtails painting outside in the grass endseq
--------------------Predicted--------------------
startseq woman in purple dress is sitting in the grass with rainbow painting in the background endseq
In [ ]:
generate_caption("101669240_b2d3e7f17b.jpg")
---------------------Actual---------------------
startseq man in hat is displaying pictures next to skier in blue hat endseq
startseq man skis past another man displaying paintings in the snow endseq
startseq person wearing skis looking at framed pictures set up in the snow endseq
startseq skier looks at framed pictures in the snow next to trees endseq
startseq man on skis looking at artwork for sale in the snow endseq
--------------------Predicted--------------------
startseq skier in blue coat is displaying pictures in the snow endseq
In [ ]:
generate_caption("49553964_cee950f3ba.jpg")
---------------------Actual---------------------
startseq man holding onto ropes while boogie boarding endseq
startseq man holds onto ropes and is pulled through the water on his ski endseq
startseq man rides wakeboard attached to parachute endseq
startseq man windsurfing endseq
startseq the man is waterskiing endseq
--------------------Predicted--------------------
startseq person in yellow and blue is riding on the water with buildings in the background endseq
In [ ]:
model.predict
Out[ ]:
<bound method Model.predict of <keras.engine.functional.Functional object at 0x0000029BF5C4DDE0>>

Image Manipulation the testing for model performance¶

In [ ]:
generate_caption("49553964_cee950f3ba.jpg") 
---------------------Actual---------------------
startseq man holding onto ropes while boogie boarding endseq
startseq man holds onto ropes and is pulled through the water on his ski endseq
startseq man rides wakeboard attached to parachute endseq
startseq man windsurfing endseq
startseq the man is waterskiing endseq
--------------------Predicted--------------------
startseq person in yellow and blue is riding on the water with buildings in the background endseq

Test methodologies which am going to take effect is Assignment wise, am gonna do image manipulation which the model will go crazy. Let start playing.¶

Test¶

In [ ]:
import cv2
import os
BASE_DIR = 'E:\Masters\Image_captioning'

def blur_flickr8k_images(src_dir, dst_dir):
    """Blur images in Flickr8k dataset

    Args:
        src_dir (str): Directory containing original images 
        dst_dir (str): Directory to save blurred images
    """ 
    
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)
        
    for img_file in os.listdir(src_dir):
        img_path = os.path.join(src_dir, img_file)
        img = cv2.imread(img_path)
        
        blurred = cv2.GaussianBlur(img, (51,51), 0)
        
        blur_path = os.path.join(dst_dir, img_file)
        cv2.imwrite(blur_path, blurred)
In [ ]:
from PIL import Image
import matplotlib.pyplot as plt
BASE_DIR_cust = 'E:\Masters\Image_captioning' 
def generate_caption_cust(image_name): 
    # load the image
    # image_name = "1001773457_577c3a7d70.jpg"
    image_id = image_name.split('.')[0] 
    img_path = os.path.join(BASE_DIR_cust, "Cust", image_name) 
    print(img_path) 
    image = Image.open(img_path)
    plt.show(image)
    captions = mapping[image_id] 
    print('---------------------Actual---------------------') 
    for caption in captions:
        print(caption) 
    # predict the caption
    y_pred = predict_caption(model, features[image_id], tokenizer, max_length) 
    print('--------------------Predicted--------------------') 
    print(y_pred)
    plt.imshow(image)
In [ ]:
from PIL import Image
from skimage import transform
import numpy as np
import matplotlib.pyplot as plt
import skimage.io as io
import cv2
import skimage

def read_image(img):
    img_path = os.path.join(BASE_DIR, "Images", img)
    image = cv2.imread(img_path)
    return image

def perform_test(test):
    threshold(test)
    generate_caption_cust(test)
    restoration_resolution(test)
    generate_caption_cust(test)
    negative(test)
    generate_caption(test)
    return

def write(img, img_nam):
    output_path = os.path.join(BASE_DIR_cust, "Cust", img_nam)
    cv2.imwrite(output_path, img)

#Test - 1, thresholding the image.
def threshold(image_):
    image = read_image(image_)
    blue_channel, green_channel, red_channel = cv2.split(image)
    threshold_value = 128
    _, blue_thresholded = cv2.threshold(blue_channel, threshold_value, 255, cv2.THRESH_BINARY)
    _, green_thresholded = cv2.threshold(green_channel, threshold_value, 255, cv2.THRESH_BINARY)
    _, red_thresholded = cv2.threshold(red_channel, threshold_value, 255, cv2.THRESH_BINARY)
    combined_thresholded_image = cv2.merge([blue_thresholded, green_thresholded, red_thresholded])
    write(combined_thresholded_image, image_)

#Test - 2, Restoration 
def restoration_resolution(img_):
    img = read_image(img_)
    new_resolution = (12, 12)
    resized_image = cv2.resize(img, new_resolution)
    restored_image = cv2.resize(resized_image, (img.shape[1], img.shape[0]))
    write(restored_image, img_)
#Test - 3, Negative.

def negative(img_):
    img = read_image(img_)
    col_neg = abs(255-img)
    write(col_neg, img_)
name = "{}.jpg".format(test[0])
perform_test(name)
3601569729_bf4bf82768.jpg
E:\Masters\Image_captioning\Cust\3601569729_bf4bf82768.jpg
---------------------Actual---------------------
startseq group of race horses run down track carrying jockeys endseq
startseq horse race endseq
startseq jockeys on horses during race endseq
startseq the horses race on the dirt track while their riders urge them on endseq
startseq "there are riders and horses in horse race going around track ." endseq
--------------------Predicted--------------------
startseq pack of horses and horses in the dirt endseq
E:\Masters\Image_captioning\Cust\3601569729_bf4bf82768.jpg
---------------------Actual---------------------
startseq group of race horses run down track carrying jockeys endseq
startseq horse race endseq
startseq jockeys on horses during race endseq
startseq the horses race on the dirt track while their riders urge them on endseq
startseq "there are riders and horses in horse race going around track ." endseq
--------------------Predicted--------------------
startseq pack of horses and horses in the dirt endseq
---------------------Actual---------------------
startseq group of race horses run down track carrying jockeys endseq
startseq horse race endseq
startseq jockeys on horses during race endseq
startseq the horses race on the dirt track while their riders urge them on endseq
startseq "there are riders and horses in horse race going around track ." endseq
--------------------Predicted--------------------
startseq pack of horses and horses in the dirt endseq
In [ ]:
 
3601569729_bf4bf82768